# Number of complaints for every (company response, year) pair, with the
# largest counts listed first. Grouping followed by tally() keeps only the
# grouping columns and the count, so no prior column selection is needed.
df %>%
  group_by(`Company response to consumer`, year) %>%
  tally(sort = TRUE, name = "number_of_complaints")
## # A tibble: 54 x 3
## # Groups:   Company response to consumer [9]
##    `Company response to consumer`   year number_of_complaints
##    <chr>                           <int>                <int>
##  1 Closed with explanation          2019               220449
##  2 Closed with explanation          2018               209346
##  3 Closed with explanation          2017               202692
##  4 Closed with explanation          2016               151698
##  5 Closed with explanation          2015               131018
##  6 Closed with explanation          2014               118186
##  7 Closed with explanation          2013                82032
##  8 Closed with non-monetary relief  2019                41792
##  9 Closed with explanation          2012                36116
## 10 Closed with non-monetary relief  2018                34600
## # … with 44 more rows

There is no seasonal or periodic pattern behind these complaints.



# Shared implementation behind year_tend() and year_tend_2().
#
# Draws an interactive line chart of the daily number of complaints filed
# against one company during one year, with one line per value of
# `Company response to consumer`. Reads the global data frame `df`.
#
# company_name: value matched exactly against the `Company` column.
# log_scale:    TRUE to draw the y axis on a log10 scale.
# report_year:  calendar year to plot (matched against the `year` column).
#
# Returns the plotly widget produced by ggplotly().
plot_yearly_complaints <- function(company_name, log_scale,
                                   report_year = 2019) {
  # paste0() avoids the doubled spaces that paste() inserts around the name.
  title <- paste0("Complaints of ", company_name,
                  " throughout ", report_year)

  daily_counts <- df %>%
    filter(year == report_year, Company == company_name) %>%
    # Parse the complete date: without the year the timestamps would be
    # placed in whatever year the code happens to be run in. UTC matches the
    # timezone date_format() uses for the axis labels, so month breaks are
    # not labelled with the neighbouring month.
    mutate(date = as.POSIXct(paste(year, month, day, sep = "-"),
                             format = "%Y-%m-%d", tz = "UTC")) %>%
    group_by(date, `Company response to consumer`) %>%
    summarise(number_of_complaints = n()) %>%
    ungroup()

  complaint_plot <- ggplot(daily_counts,
                           aes(x = date, y = number_of_complaints,
                               color = `Company response to consumer`)) +
    geom_line() +
    ylab("Number of complaints") +
    theme_tufte() +
    scale_x_datetime(labels = date_format("%b"),
                     date_breaks = "1 month")

  if (log_scale) {
    complaint_plot <- complaint_plot + scale_y_log10()
  }

  complaint_plot <- complaint_plot +
    theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
          legend.text = element_text(size = 8),
          legend.title = element_text(size = 8),
          axis.text.x = element_text(angle = 45)) +
    ggtitle(title)

  ggplotly(complaint_plot)
}

# Daily complaints for `company_name` on a log10 y axis; suited to companies
# whose response categories differ by orders of magnitude.
year_tend <- function(company_name, report_year = 2019) {
  plot_yearly_complaints(company_name, log_scale = TRUE,
                         report_year = report_year)
}
year_tend("EQUIFAX, INC.")

# Daily complaints for `company_name` on a linear y axis.
year_tend_2 <- function(company_name, report_year = 2019) {
  plot_yearly_complaints(company_name, log_scale = FALSE,
                         report_year = report_year)
}
year_tend_2("BANK OF AMERICA, NATIONAL ASSOCIATION")



# Interactive table of the 100 most frequent complaint combinations for one
# company. Reads the global data frame `df`.
#
# Complaints are counted per (date, Product, State, `Submitted via`,
# `Company response to consumer`) and sorted by count, largest first.
#
# company_name: value matched exactly against the `Company` column.
#
# Returns the htmlwidget created by datatable().
DT <- function(company_name) {
  top_combinations <- df %>%
    filter(Company == company_name) %>%
    # Build the date from year, month and day. Parsing month and day alone
    # would merge complaints from different years onto one date, show the
    # current year in the table, and turn Feb 29 into NA in non-leap years.
    mutate(date = as.Date(paste(year, month, day, sep = "-"),
                          format = "%Y-%m-%d")) %>%
    group_by(date, Product, State,
             `Submitted via`, `Company response to consumer`) %>%
    summarise(number_of_complaints = n()) %>%
    ungroup() %>%
    arrange(desc(number_of_complaints)) %>%
    head(100)

  # Turn column names into display headers: dots and underscores become
  # spaces, then every word is capitalised.
  pretty_headers <-
    gsub("[._]", " ", colnames(top_combinations)) %>%
    str_to_title()

  # Company names such as "EQUIFAX, INC." already end with a period.
  closing <- if (endsWith(company_name, ".")) "" else "."
  caption_text <- paste0(
    "Table 1: This is a simple data table for the complaints of ",
    company_name, closing
  )

  # The result is the last expression; a distinct local name avoids
  # shadowing this function's own name.
  top_combinations %>%
    datatable(
      caption = caption_text,
      rownames = FALSE,
      class = "cell-border stripe",
      colnames = pretty_headers,
      filter = list(position = "top"),
      options = list(
        dom = "Bfrtip",
        buttons = I("colvis"),
        language = list(sSearch = "Filter:")
      ),
      extensions = c("Buttons", "Responsive")
    )
}
DT("EQUIFAX, INC.")



Machine Learning Trial #1

# Modelling data for one company. The outcome is whether a complaint was
# closed with monetary relief; the narrative and public-response columns are
# reduced to 0/1 "was anything provided" flags.
df_ml <- df_raw %>%
  # Refer to the column directly: inside filter() the data frame's columns
  # are already in scope.
  filter(Company == "BANK OF AMERICA, NATIONAL ASSOCIATION") %>%
  select(Product, State, `Company response to consumer`,
         `Company public response`,
         `Consumer complaint narrative`, `Consumer consent provided?`) %>%
  mutate(
    # Two-level factor outcome with "withMonetaryRelief" as the first level;
    # a missing response stays NA and is removed below.
    `Company response to consumer` = factor(
      ifelse(`Company response to consumer` == "Closed with monetary relief",
             1, 0),
      levels = 1:0,
      labels = c("withMonetaryRelief", "noMonetaryRelief")
    ),
    # 1 = consent provided, 0 = any other answer, NA = not recorded.
    `Consumer consent provided?` =
      ifelse(`Consumer consent provided?` == "Consent provided", 1, 0),
    # 1 when a value is present; 0 when it is missing or the literal "NA".
    `Consumer complaint narrative` = as.numeric(
      !is.na(`Consumer complaint narrative`) &
        `Consumer complaint narrative` != "NA"
    ),
    `Company public response` = as.numeric(
      !is.na(`Company public response`) &
        `Company public response` != "NA"
    )
  ) %>%
  # The two presence flags can no longer be NA, so only the outcome and the
  # consent flag need a missing-value filter.
  filter(!is.na(`Company response to consumer`),
         !is.na(`Consumer consent provided?`))

# Reproducible 80/20 split, stratified on the outcome.
set.seed(12345)
in_train <- createDataPartition(y = df_ml$`Company response to consumer`,
                                p = 0.8, list = FALSE)
# createDataPartition(list = FALSE) returns a one-column matrix; a plain
# vector index works for both data frames and tibbles.
training <- df_ml[as.vector(in_train), ]
testing  <- df_ml[-as.vector(in_train), ]

Random Forest - Supervised learning

# Random forest for the monetary-relief outcome.
#
# The predictors exclude the outcome itself (the earlier formula listed it
# on both sides) and are passed as columns of the data, not as `training$...`
# vectors, so that predict() really uses the rows of `testing`.
rf_outcome <- "Company response to consumer"
rf_predictors <- c("Product", "Company public response",
                   "Consumer complaint narrative",
                   "Consumer consent provided?")

# Use one set of Product levels for both splits so the factor encoding seen
# at prediction time matches the one used for fitting.
product_levels <- sort(unique(c(as.character(training$Product),
                                as.character(testing$Product))))

# Keep the model columns, encode Product as a factor and drop incomplete
# rows (the x/y interface of randomForest() does not accept missing values).
rf_frame <- function(data) {
  out <- as.data.frame(data[, c(rf_outcome, rf_predictors)])
  out$Product <- factor(as.character(out$Product), levels = product_levels)
  out[complete.cases(out), , drop = FALSE]
}

rf_train <- rf_frame(training)
rf_test <- rf_frame(testing)

# The x/y interface is used because the column names contain spaces.
rf <- randomForest(x = rf_train[, rf_predictors, drop = FALSE],
                   y = rf_train[[rf_outcome]],
                   importance = TRUE)
y_hat_rf <- predict(rf, newdata = rf_test[, rf_predictors, drop = FALSE],
                    type = "response")
# Compare against the actual outcome column of the held-out rows.
confusionMatrix(y_hat_rf, reference = rf_test[[rf_outcome]])

LDA - Supervised learning

# Linear discriminant analysis for the monetary-relief outcome, fitted
# through caret with centred and scaled predictors. The outcome must not be
# listed among its own predictors, so it appears on the left-hand side only.
LDA <- train(`Company response to consumer` ~
               as.factor(Product) +
               `Company public response` +
               `Consumer complaint narrative` +
               `Consumer consent provided?`,
             data = training, method = "lda",
             preProcess = c("center", "scale"))

z_LDA <- predict(LDA, newdata = testing)
# Evaluate against the actual outcome column of the held-out rows.
confusionMatrix(z_LDA, reference = testing$`Company response to consumer`)

Clustering - Semi-supervised learning